// CuNNy 3x4C BILINEAR RGB NVL - https://github.com/funnyplanter/CuNNy

// This program is free software: you can redistribute it and/or modify
// it under the terms of the GNU General Public License as published by
// the Free Software Foundation, either version 3 of the License, or
// (at your option) any later version.
// 
// This program is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
// GNU General Public License for more details.
// 
// You should have received a copy of the GNU General Public License
// along with this program.  If not, see <http://www.gnu.org/licenses/>.

//!MAGPIE EFFECT
//!VERSION 4
//!SORT_NAME CuNNy-D04N03
//!USE MulAdd
//!CAPABILITY FP16

#include "../StubDefs.hlsli"

// Effect input texture; Pass1 and Pass5 list it in their //!IN directives.
//!TEXTURE
Texture2D INPUT;

// Effect output texture, written by Pass5. The directives below size it to twice
// the input width and height.
//!TEXTURE
//!WIDTH INPUT_WIDTH * 2
//!HEIGHT INPUT_HEIGHT * 2
Texture2D OUTPUT;

// Point-filtered sampler; the O() fetch macro samples through it.
//!SAMPLER
//!FILTER POINT
SamplerState SP;

// Linear-filtered sampler; Pass5 uses it to read INPUT at output-pixel positions.
//!SAMPLER
//!FILTER LINEAR
SamplerState SL;

//!COMMON
// Samples texture `t` with the point sampler SP at mip level 0, at `pos + p * pt`.
// The expansion refers to `pos` and `pt` by name, so both must be locals at the
// expansion site. `p` is parenthesized so that a compound offset expression is
// scaled by `pt` as a whole instead of only its last term.
#define O(t, p) t.SampleLevel(SP, pos + (p) * pt, 0)
// Short aliases for the MF4 / MF4x4 types (declared outside this file).
#define V4 MF4
#define M4 MF4x4

// Intermediate feature texture at input resolution, four signed-normalized 8-bit channels.
// Written by Pass1 and Pass3, read by Pass2 and Pass4 (see their //!IN / //!OUT directives).
//!TEXTURE
//!WIDTH INPUT_WIDTH
//!HEIGHT INPUT_HEIGHT
//!FORMAT R8G8B8A8_SNORM
Texture2D t0;

// Second intermediate feature texture with the same size and format as t0.
// Written by Pass2 and Pass4, read by Pass3 and Pass5.
//!TEXTURE
//!WIDTH INPUT_WIDTH
//!HEIGHT INPUT_HEIGHT
//!FORMAT R8G8B8A8_SNORM
Texture2D t1;

//!PASS 1
//!DESC in
//!BLOCK_SIZE 8
//!NUM_THREADS 64
//!IN INPUT
//!OUT t0

// Pass-1 tap: fetches INPUT at texel offset (x, y) via O(), takes the dot product of its RGB
// with a fixed weight vector, adds a constant bias, and casts the scalar result to MF.
#define l0(x, y) MF((dot(float3(6.094e-01, 1.148e+00, 2.568e-01), O(INPUT, float2(x, y)).rgb) + -1.542e+00))

// Pass-1 kernel: maps nine scalar taps to a 4-component result.
// Starts from a constant bias vector and adds, tap by tap in argument order,
// the tap value times that tap's 4-component weight vector (via mad).
// Pass1 supplies the taps in row-major 3x3 order (s0_0 = top-left ... s0_8 = bottom-right).
V4 f0(MF s0_0, MF s0_1, MF s0_2, MF s0_3, MF s0_4, MF s0_5, MF s0_6, MF s0_7, MF s0_8) {
	V4 acc = V4(-4.952e-03, -2.750e-03, -9.137e-04, 6.736e-02);

	// Top row.
	acc = mad(s0_0, V4(-6.372e-02, 1.685e-01, -2.573e-02, -2.185e-02), acc);
	acc = mad(s0_1, V4(-3.502e-02, -2.984e-03, 5.048e-02, -2.445e-01), acc);
	acc = mad(s0_2, V4(9.644e-02, -7.557e-03, -1.770e-02, 3.162e-02), acc);

	// Middle row.
	acc = mad(s0_3, V4(7.199e-02, -6.233e-01, -4.180e-01, 1.392e-01), acc);
	acc = mad(s0_4, V4(-5.683e-01, 1.451e-01, -8.148e-02, 9.768e-02), acc);
	acc = mad(s0_5, V4(4.702e-01, -1.319e-03, 3.745e-03, -4.204e-02), acc);

	// Bottom row.
	acc = mad(s0_6, V4(9.855e-03, 3.213e-01, 5.098e-01, 4.001e-02), acc);
	acc = mad(s0_7, V4(8.216e-02, -1.219e-02, -3.347e-02, 5.017e-02), acc);
	acc = mad(s0_8, V4(-6.691e-02, 5.417e-03, 1.235e-02, -9.640e-03), acc);

	return acc;
}

// Pass 1 ("in"): each thread handles one input texel. It evaluates the l0() tap at the
// 3x3 neighbourhood of that texel and stores f0() of the nine taps into t0.
void Pass1(uint2 blockStart, uint3 tid) {
	// `pt` and `pos` must keep exactly these names: the O()/l0() macros expand to them.
	float2 pt = float2(GetInputPt());

	uint2 texel = blockStart + Rmp8x8(tid.x);
	uint2 dims = GetInputSize();
	// Threads that fall outside the input do nothing.
	if (any(texel >= dims)) {
		return;
	}

	float2 pos = (texel + 0.5) * pt;

	// Taps are passed in row-major order: top row, middle row, bottom row.
	t0[texel] = f0(
		l0(-1.0, -1.0), l0(0.0, -1.0), l0(1.0, -1.0),
		l0(-1.0, 0.0), l0(0.0, 0.0), l0(1.0, 0.0),
		l0(-1.0, 1.0), l0(0.0, 1.0), l0(1.0, 1.0));
}

//!PASS 2
//!DESC conv1
//!BLOCK_SIZE 8
//!NUM_THREADS 64
//!IN t0
//!OUT t1

// Pass-2 tap: all four channels of t0 at texel offset (x, y), fetched via O() and converted to V4.
#define l0(x, y) V4(O(t0, float2(x, y)))

// Pass-2 ("conv1") kernel: starts from a 4-component bias and folds in one MulAdd per
// argument, in argument order, each with its own 16-coefficient M4 weight block.
// Pass2 passes the max(x, 0) halves of the nine t0 taps as s0_* and the -max(-x, 0)
// halves as s1_*, both in row-major 3x3 order.
// NOTE(review): MulAdd is pulled in by `//!USE MulAdd`; presumably it computes
// mul(vector, matrix) + r — confirm against the Magpie definition.
// The literals are generated weights; do not reorder or reformat them by hand.
V4 f0(V4 s0_0, V4 s0_1, V4 s0_2, V4 s0_3, V4 s0_4, V4 s0_5, V4 s0_6, V4 s0_7, V4 s0_8, V4 s1_0, V4 s1_1, V4 s1_2, V4 s1_3, V4 s1_4, V4 s1_5, V4 s1_6, V4 s1_7, V4 s1_8) {
	// Bias.
	V4 r = { -1.511e-02, -2.848e-03, 7.160e-03, -2.555e-03 };
	// Contributions of the s0_* taps.
	r = MulAdd(s0_0, M4(3.169e-01, 3.467e-01, -2.365e-01, 2.253e-01, 6.307e-02, 1.727e-01, -1.053e-01, 9.324e-02, -4.901e-02, -2.112e-01, 8.983e-02, -1.851e-01, -1.987e-01, 6.645e-02, 2.188e-02, 1.988e-02), r);
	r = MulAdd(s0_1, M4(4.393e-02, 2.078e-01, -1.967e-01, 4.673e-02, -7.991e-02, 2.461e-01, -6.028e-02, 9.252e-02, 3.871e-01, 6.138e-02, -3.603e-01, -1.485e-01, 2.466e-01, 5.251e-02, -6.181e-02, 8.932e-02), r);
	r = MulAdd(s0_2, M4(-1.707e-02, 2.598e-02, 1.641e-02, 2.780e-02, 2.425e-02, 1.769e-01, -8.461e-02, 1.067e-01, -2.503e-01, 6.051e-01, -2.782e-01, 1.311e-01, -8.456e-03, -1.370e-02, -6.391e-02, 6.935e-02), r);
	r = MulAdd(s0_3, M4(-8.251e-01, -4.981e-01, -1.726e-01, -1.815e-01, 1.411e-01, 2.889e-02, -3.115e-01, -3.255e-01, 1.812e-03, -4.529e-02, 2.350e-01, 1.999e-01, -1.993e-01, -1.868e-02, 4.249e-02, -1.117e-01), r);
	r = MulAdd(s0_4, M4(-4.732e-02, -5.673e-02, 1.274e-01, 4.894e-02, 9.126e-02, 1.717e-01, -3.294e-01, -2.378e-01, -7.089e-02, -8.116e-02, 2.510e-01, 7.381e-02, 1.275e-01, 8.030e-02, -1.671e-01, -1.824e-02), r);
	r = MulAdd(s0_5, M4(3.373e-02, -4.163e-02, -4.077e-02, -2.085e-02, 1.265e-01, -4.133e-01, 7.433e-02, 7.763e-02, -1.466e-01, 3.291e-01, -7.784e-02, 9.472e-02, 2.725e-01, -2.393e-01, -6.913e-02, -9.445e-02), r);
	r = MulAdd(s0_6, M4(3.043e-02, -9.985e-02, 1.538e-01, -2.529e-01, 2.379e-01, 1.079e-01, -1.517e-01, -9.289e-02, -1.396e-01, -4.354e-02, 8.463e-02, 7.052e-02, 5.629e-02, 3.293e-03, 5.342e-02, -1.606e-01), r);
	r = MulAdd(s0_7, M4(3.626e-02, -1.421e-01, 4.017e-02, -3.963e-02, 2.148e-03, 5.522e-02, 3.174e-01, 2.270e-02, -5.590e-02, -9.875e-02, -1.683e-01, 5.415e-02, 1.509e-01, 7.709e-02, -1.161e-01, 1.440e-01), r);
	r = MulAdd(s0_8, M4(-1.132e-02, 2.337e-02, 1.264e-02, 2.638e-03, -6.582e-02, -1.965e-01, 2.803e-01, 1.333e-01, 9.171e-02, 1.567e-01, -2.419e-01, -1.602e-01, -2.271e-01, 3.614e-02, 2.179e-01, 4.826e-02), r);
	// Contributions of the s1_* taps.
	r = MulAdd(s1_0, M4(1.452e-01, 1.313e-01, -6.140e-02, 2.412e-01, -3.691e-02, 7.355e-02, -4.209e-02, 1.343e-01, -2.509e-02, -1.266e-01, 9.017e-02, -1.854e-02, -4.280e-01, -1.004e-01, 2.319e-01, 4.211e-02), r);
	r = MulAdd(s1_1, M4(4.894e-02, 7.564e-02, -9.350e-02, 5.422e-02, -6.111e-02, 6.969e-02, -4.398e-02, 6.622e-02, 7.113e-01, 3.461e-01, -5.254e-01, -8.808e-02, 4.481e-01, 3.171e-01, -2.198e-01, 1.048e-01), r);
	r = MulAdd(s1_2, M4(-3.483e-02, 3.150e-03, 2.215e-02, 2.616e-02, 1.468e-01, -1.295e-01, -1.470e-01, 3.371e-02, -4.514e-02, 4.677e-02, -1.313e-01, -1.176e-01, 1.507e-03, 2.290e-01, -2.163e-01, 3.895e-02), r);
	r = MulAdd(s1_3, M4(-2.258e-01, -1.353e-01, -4.873e-01, -1.236e+00, 1.660e-01, -1.803e-02, -2.797e-01, -4.092e-01, -1.525e-01, -8.178e-02, 2.665e-01, 3.652e-01, -1.853e-01, -3.819e-02, 1.627e-01, -3.896e-01), r);
	r = MulAdd(s1_4, M4(-1.005e-01, -3.821e-02, 9.917e-02, -1.324e-01, -2.040e-01, -3.586e-01, 9.776e-02, -1.376e-01, 2.065e-01, 2.017e-01, -1.320e-01, -2.225e-02, 2.944e-01, 5.393e-02, -4.301e-01, -7.240e-02), r);
	r = MulAdd(s1_5, M4(5.353e-02, -4.257e-02, -4.131e-02, -3.943e-02, -6.151e-02, 3.059e-01, -1.481e-02, 3.662e-01, 3.098e-02, -8.774e-02, 1.790e-02, -1.332e-01, 8.670e-02, -6.985e-02, -1.359e-01, 2.063e-01), r);
	r = MulAdd(s1_6, M4(-9.271e-02, 2.259e-01, 2.200e-02, -2.390e-01, 3.258e-01, 1.082e-01, -1.499e-01, -3.063e-02, -2.775e-01, -9.008e-02, 1.294e-01, 3.533e-02, 1.011e-02, 4.294e-02, 4.935e-02, -1.005e-01), r);
	r = MulAdd(s1_7, M4(1.321e-02, -7.160e-02, 7.229e-02, -3.050e-02, 4.303e-02, -1.518e-01, 5.137e-01, 4.029e-02, 4.896e-02, 5.334e-02, -3.545e-01, 2.370e-02, 1.645e-01, 3.433e-02, -9.552e-03, 1.032e-01), r);
	r = MulAdd(s1_8, M4(8.370e-03, -2.408e-02, 2.693e-02, -8.183e-03, -2.375e-02, -2.973e-01, 1.889e-01, 1.096e-01, 1.093e-02, 2.310e-01, -1.613e-01, -1.343e-01, -1.718e-01, -2.165e-02, 1.384e-01, 9.956e-02), r);
	return r;
}

// Pass 2 ("conv1"): each thread handles one input-resolution texel. It reads the 3x3
// neighbourhood of t0, splits every tap into max(x, 0) and -max(-x, 0), and stores
// f0() of the eighteen resulting vectors into t1.
void Pass2(uint2 blockStart, uint3 tid) {
	// `pt` and `pos` must keep exactly these names: the O()/l0() macros expand to them.
	float2 pt = float2(GetInputPt());

	uint2 texel = blockStart + Rmp8x8(tid.x);
	uint2 dims = GetInputSize();
	// Threads that fall outside the input do nothing.
	if (any(texel >= dims)) {
		return;
	}

	float2 pos = (texel + 0.5) * pt;

	// Raw taps, one row of the 3x3 window per line (y = -1, 0, +1; x = -1, 0, +1).
	V4 a0 = l0(-1.0, -1.0), a1 = l0(0.0, -1.0), a2 = l0(1.0, -1.0);
	V4 a3 = l0(-1.0, 0.0), a4 = l0(0.0, 0.0), a5 = l0(1.0, 0.0);
	V4 a6 = l0(-1.0, 1.0), a7 = l0(0.0, 1.0), a8 = l0(1.0, 1.0);

	// First nine arguments: max(x, 0) of each tap. Last nine: -max(-x, 0) of each tap.
	t1[texel] = f0(
		max(a0, 0.0), max(a1, 0.0), max(a2, 0.0),
		max(a3, 0.0), max(a4, 0.0), max(a5, 0.0),
		max(a6, 0.0), max(a7, 0.0), max(a8, 0.0),
		-max(-a0, 0.0), -max(-a1, 0.0), -max(-a2, 0.0),
		-max(-a3, 0.0), -max(-a4, 0.0), -max(-a5, 0.0),
		-max(-a6, 0.0), -max(-a7, 0.0), -max(-a8, 0.0));
}

//!PASS 3
//!DESC conv2
//!BLOCK_SIZE 8
//!NUM_THREADS 64
//!IN t1
//!OUT t0

// Pass-3 tap: all four channels of t1 at texel offset (x, y), fetched via O() and converted to V4.
#define l0(x, y) V4(O(t1, float2(x, y)))

// Pass-3 ("conv2") kernel: starts from a 4-component bias and folds in one MulAdd per
// argument, in argument order, each with its own 16-coefficient M4 weight block.
// Pass3 passes the max(x, 0) halves of the nine t1 taps as s0_* and the -max(-x, 0)
// halves as s1_*, both in row-major 3x3 order.
// NOTE(review): MulAdd is pulled in by `//!USE MulAdd`; presumably it computes
// mul(vector, matrix) + r — confirm against the Magpie definition.
// The literals are generated weights; do not reorder or reformat them by hand.
V4 f0(V4 s0_0, V4 s0_1, V4 s0_2, V4 s0_3, V4 s0_4, V4 s0_5, V4 s0_6, V4 s0_7, V4 s0_8, V4 s1_0, V4 s1_1, V4 s1_2, V4 s1_3, V4 s1_4, V4 s1_5, V4 s1_6, V4 s1_7, V4 s1_8) {
	// Bias.
	V4 r = { -3.514e-03, 2.350e-03, 2.221e-03, 1.089e-03 };
	// Contributions of the s0_* taps.
	r = MulAdd(s0_0, M4(6.983e-02, 8.935e-03, -1.644e-01, -4.232e-04, -1.981e-01, 9.265e-02, 1.769e-01, 1.705e-01, -2.300e-02, -7.408e-03, -4.221e-02, -1.617e-02, -6.026e-02, -9.185e-03, -7.420e-02, -4.238e-02), r);
	r = MulAdd(s0_1, M4(1.832e-01, -1.117e-01, 1.784e-02, 6.345e-02, -9.651e-02, 5.753e-02, 1.480e-01, 1.284e-01, 3.957e-01, -2.684e-01, 2.853e-02, -5.823e-02, -8.184e-02, 1.062e-01, -2.604e-02, -7.579e-02), r);
	r = MulAdd(s0_2, M4(-1.753e-01, 5.019e-03, -1.285e-01, 8.470e-02, -2.566e-01, 6.556e-02, -9.751e-02, 7.653e-03, -9.466e-02, 3.098e-02, -9.617e-02, -4.826e-02, 3.951e-02, -5.446e-02, 1.297e-01, 1.076e-01), r);
	r = MulAdd(s0_3, M4(-7.377e-02, -2.183e-01, 9.806e-02, 1.735e-01, 2.795e-01, 3.730e-01, 1.906e-01, 1.313e-01, 2.115e-01, 2.222e-01, 1.880e-01, 2.427e-01, -1.177e-01, 2.587e-02, -1.928e-01, -1.489e-01), r);
	r = MulAdd(s0_4, M4(-3.487e-01, -3.194e-01, 7.963e-01, -1.044e-01, 3.136e-01, -5.467e-02, 5.059e-01, -4.801e-02, -4.943e-01, -1.466e-01, -5.938e-02, -9.473e-01, 2.661e-01, -1.545e-01, 1.986e-01, -2.172e-02), r);
	r = MulAdd(s0_5, M4(-3.450e-01, 1.931e-01, -2.303e-01, -1.880e-01, -1.323e-01, 1.839e-01, -1.130e-01, -5.181e-02, 3.049e-02, 9.834e-02, -1.342e-01, -1.072e-01, 1.925e-02, -9.652e-02, 1.169e-01, 2.084e-01), r);
	r = MulAdd(s0_6, M4(1.543e-02, 2.202e-01, 4.809e-02, 1.085e-01, 3.076e-02, -4.127e-01, 4.606e-02, 9.444e-02, 7.886e-02, -1.314e-01, -1.638e-02, 4.353e-02, 9.790e-02, -6.783e-02, -1.008e-01, -1.558e-01), r);
	r = MulAdd(s0_7, M4(-4.453e-02, 3.133e-01, -2.217e-01, -5.271e-02, -2.055e-01, -1.000e-01, 8.374e-02, 6.141e-02, 2.147e-02, -3.844e-01, -2.203e-01, -1.105e-01, -3.596e-02, 2.026e-01, 3.174e-01, 1.519e-01), r);
	r = MulAdd(s0_8, M4(-5.107e-03, 2.380e-01, 2.147e-02, -8.032e-02, -9.743e-02, 6.943e-02, 9.403e-02, 3.742e-02, -1.822e-02, -4.950e-02, 7.963e-02, -1.338e-01, -1.491e-01, 1.655e-02, -5.817e-02, 1.164e-01), r);
	// Contributions of the s1_* taps.
	r = MulAdd(s1_0, M4(8.679e-02, -7.335e-02, -5.999e-02, -4.504e-02, -3.329e-02, 4.349e-03, -4.883e-02, 3.159e-02, -7.948e-02, 3.308e-02, 6.579e-02, 1.607e-01, 1.336e-01, -1.042e-01, -2.368e-01, -1.546e-01), r);
	r = MulAdd(s1_1, M4(2.764e-01, -6.665e-02, 1.661e-02, -4.103e-02, 1.095e-01, -1.159e-01, -1.142e-01, -1.412e-01, 4.033e-01, -8.697e-02, 2.387e-01, 1.762e-01, 4.948e-01, -1.533e-01, 7.816e-02, 5.700e-02), r);
	r = MulAdd(s1_2, M4(1.187e-01, -6.571e-02, 4.698e-02, 4.931e-02, -5.523e-02, 3.925e-02, -7.453e-02, -8.429e-02, -2.202e-01, 6.090e-02, -1.460e-01, 2.777e-02, 4.405e-01, 6.445e-03, 3.494e-01, 3.311e-01), r);
	r = MulAdd(s1_3, M4(-4.333e-02, -8.517e-02, 1.372e-01, 2.066e-01, 4.728e-01, 1.195e-01, -2.627e-01, -2.280e-01, 1.606e-01, 2.216e-01, 2.269e-01, 3.505e-01, -2.499e-01, -3.977e-01, -3.659e-02, 1.460e-02), r);
	r = MulAdd(s1_4, M4(-4.640e-01, -7.221e-01, -2.524e-01, -6.513e-01, 6.699e-01, -1.727e-01, 4.444e-01, -3.115e-01, -6.748e-01, 1.063e-01, 6.487e-01, -3.195e-01, -5.136e-01, -8.272e-01, 4.014e-01, 4.914e-01), r);
	r = MulAdd(s1_5, M4(-1.112e-03, -1.293e-02, 1.567e-02, -1.266e-01, 1.185e-01, 4.940e-02, -9.925e-02, -1.034e-01, -1.041e-01, 1.822e-01, -4.277e-02, 1.313e-01, -6.459e-01, -1.562e-01, -3.961e-01, -7.262e-02), r);
	r = MulAdd(s1_6, M4(1.499e-02, 3.135e-01, 2.187e-01, 2.386e-01, 1.171e-01, -4.899e-01, -1.987e-01, -1.717e-01, 5.232e-02, -1.984e-01, 9.338e-04, 1.092e-01, 1.545e-01, 4.183e-01, 1.180e-01, 1.102e-01), r);
	r = MulAdd(s1_7, M4(-1.411e-01, 2.619e-01, -2.549e-01, -2.113e-01, -1.109e-01, -3.038e-01, 7.579e-02, -3.585e-02, -1.373e-03, -2.713e-01, -5.527e-02, 7.052e-02, -1.648e-01, 7.324e-01, 3.974e-01, 2.306e-01), r);
	r = MulAdd(s1_8, M4(-1.861e-02, 9.414e-02, -6.739e-02, -8.921e-02, -2.337e-02, -2.657e-02, -3.376e-03, -7.209e-02, -1.042e-01, -2.504e-02, 1.287e-01, -1.459e-02, -1.617e-01, 2.384e-01, -6.969e-01, -3.760e-01), r);
	return r;
}

// Pass 3 ("conv2"): each thread handles one input-resolution texel. It reads the 3x3
// neighbourhood of t1, splits every tap into max(x, 0) and -max(-x, 0), and stores
// f0() of the eighteen resulting vectors into t0.
void Pass3(uint2 blockStart, uint3 tid) {
	// `pt` and `pos` must keep exactly these names: the O()/l0() macros expand to them.
	float2 pt = float2(GetInputPt());

	uint2 texel = blockStart + Rmp8x8(tid.x);
	uint2 dims = GetInputSize();
	// Threads that fall outside the input do nothing.
	if (any(texel >= dims)) {
		return;
	}

	float2 pos = (texel + 0.5) * pt;

	// Raw taps, one row of the 3x3 window per line (y = -1, 0, +1; x = -1, 0, +1).
	V4 a0 = l0(-1.0, -1.0), a1 = l0(0.0, -1.0), a2 = l0(1.0, -1.0);
	V4 a3 = l0(-1.0, 0.0), a4 = l0(0.0, 0.0), a5 = l0(1.0, 0.0);
	V4 a6 = l0(-1.0, 1.0), a7 = l0(0.0, 1.0), a8 = l0(1.0, 1.0);

	// First nine arguments: max(x, 0) of each tap. Last nine: -max(-x, 0) of each tap.
	t0[texel] = f0(
		max(a0, 0.0), max(a1, 0.0), max(a2, 0.0),
		max(a3, 0.0), max(a4, 0.0), max(a5, 0.0),
		max(a6, 0.0), max(a7, 0.0), max(a8, 0.0),
		-max(-a0, 0.0), -max(-a1, 0.0), -max(-a2, 0.0),
		-max(-a3, 0.0), -max(-a4, 0.0), -max(-a5, 0.0),
		-max(-a6, 0.0), -max(-a7, 0.0), -max(-a8, 0.0));
}

//!PASS 4
//!DESC conv3
//!BLOCK_SIZE 8
//!NUM_THREADS 64
//!IN t0
//!OUT t1

// Pass-4 tap: all four channels of t0 at texel offset (x, y), fetched via O() and converted to V4.
#define l0(x, y) V4(O(t0, float2(x, y)))

// Pass-4 ("conv3") kernel: starts from a 4-component bias and folds in one MulAdd per
// argument, in argument order, each with its own 16-coefficient M4 weight block.
// Pass4 passes the max(x, 0) halves of the nine t0 taps as s0_* and the -max(-x, 0)
// halves as s1_*, both in row-major 3x3 order.
// NOTE(review): MulAdd is pulled in by `//!USE MulAdd`; presumably it computes
// mul(vector, matrix) + r — confirm against the Magpie definition.
// The literals are generated weights; do not reorder or reformat them by hand.
V4 f0(V4 s0_0, V4 s0_1, V4 s0_2, V4 s0_3, V4 s0_4, V4 s0_5, V4 s0_6, V4 s0_7, V4 s0_8, V4 s1_0, V4 s1_1, V4 s1_2, V4 s1_3, V4 s1_4, V4 s1_5, V4 s1_6, V4 s1_7, V4 s1_8) {
	// Bias.
	V4 r = { -4.349e-03, -3.760e-03, 4.684e-03, 4.745e-03 };
	// Contributions of the s0_* taps.
	r = MulAdd(s0_0, M4(1.869e-01, 8.774e-02, -6.451e-02, 6.682e-02, 8.374e-02, 1.313e-02, -2.649e-02, 2.741e-02, -3.609e-02, -9.330e-02, -8.233e-02, 1.117e-01, -1.203e-01, 1.719e-02, 1.288e-01, -9.851e-02), r);
	r = MulAdd(s0_1, M4(3.100e-01, 5.063e-02, 1.169e-01, -3.828e-02, 3.428e-01, 4.869e-02, -1.232e-02, -1.003e-02, 2.756e-01, 3.916e-01, 1.450e-01, 1.078e-01, -2.568e-01, -2.157e-01, -1.057e-01, -1.338e-01), r);
	r = MulAdd(s0_2, M4(1.199e-01, -1.890e-01, 5.870e-03, -5.995e-03, 2.255e-01, -2.325e-03, 7.916e-03, -2.038e-02, 1.353e-01, -9.590e-02, -2.119e-02, -5.860e-02, -7.698e-02, -3.608e-02, -3.571e-02, 2.010e-02), r);
	r = MulAdd(s0_3, M4(9.889e-02, -2.665e-02, -2.627e-01, 3.583e-01, 7.891e-02, 8.737e-02, 5.322e-02, 5.246e-04, -5.188e-02, -8.491e-02, -4.991e-02, -3.735e-02, 5.711e-02, 4.482e-02, 5.660e-02, -1.322e-01), r);
	r = MulAdd(s0_4, M4(-5.488e-01, 2.898e-01, 1.046e+00, 6.036e-01, -3.180e-01, -6.309e-01, -2.627e-01, 1.734e-01, -2.067e-01, 3.775e-02, -2.881e-01, -9.242e-02, 3.369e-01, 2.554e-02, -1.645e-01, 4.973e-01), r);
	r = MulAdd(s0_5, M4(6.976e-03, -1.830e-01, 2.842e-01, 2.570e-02, -2.902e-01, 5.059e-01, 1.944e-01, 1.794e-02, -1.333e-01, 2.341e-01, 4.161e-01, -5.179e-02, 8.176e-02, -2.435e-02, -1.598e-02, 6.211e-02), r);
	r = MulAdd(s0_6, M4(-2.668e-02, -6.958e-02, -5.015e-02, 8.035e-02, 4.451e-02, -1.290e-03, -7.688e-02, 1.708e-01, -5.133e-02, -2.768e-02, -1.780e-02, -6.317e-02, -9.692e-03, -2.748e-03, 9.070e-03, -1.314e-01), r);
	r = MulAdd(s0_7, M4(1.402e-01, 4.997e-02, -4.973e-02, 6.839e-01, 2.079e-02, -2.511e-02, 3.403e-01, -3.077e-01, -2.831e-02, 4.816e-02, -9.142e-02, -8.176e-02, -2.999e-02, -5.749e-03, -5.579e-02, -2.355e-01), r);
	r = MulAdd(s0_8, M4(-1.783e-02, -2.882e-02, 9.841e-02, 4.473e-02, 4.128e-02, -3.071e-02, -2.378e-01, 1.347e-01, -2.285e-02, 1.317e-02, -1.632e-02, 1.058e-01, -3.696e-02, -6.864e-03, -8.989e-02, -7.315e-02), r);
	// Contributions of the s1_* taps.
	r = MulAdd(s1_0, M4(8.857e-02, 3.169e-02, -1.896e-02, 1.258e-02, 7.086e-02, 5.699e-02, 1.550e-02, -1.836e-02, 1.209e-01, 5.334e-02, -1.557e-02, -2.374e-02, -1.411e-02, 1.543e-02, 1.769e-02, -4.332e-02), r);
	r = MulAdd(s1_1, M4(1.199e-01, -8.203e-03, -1.695e-02, -3.214e-02, 5.918e-01, 3.458e-01, 7.684e-02, -5.137e-01, 2.827e-01, -2.008e-02, -1.848e-01, 2.147e-01, 7.212e-02, -3.906e-03, -2.220e-01, -1.918e-01), r);
	r = MulAdd(s1_2, M4(4.464e-02, 4.035e-02, 4.265e-03, 1.350e-02, -4.623e-01, -1.882e-01, 9.929e-02, -2.295e-01, 2.010e-01, 6.059e-01, 3.648e-01, -1.670e-02, -6.763e-02, -2.588e-01, -1.741e-01, 3.358e-02), r);
	r = MulAdd(s1_3, M4(1.003e-01, -2.961e-02, -1.715e-01, 1.057e-01, 3.275e-03, 1.877e-02, -4.995e-02, 1.181e-01, 3.600e-02, 2.101e-02, -1.050e-01, 8.035e-02, -8.107e-02, -1.067e-01, -5.457e-02, 5.339e-02), r);
	r = MulAdd(s1_4, M4(3.875e-01, 3.638e-01, 1.178e-01, -4.404e-02, 6.128e-02, -1.193e-01, -3.161e-01, 3.510e-01, -3.482e-02, -2.842e-01, -3.917e-01, 4.525e-01, 1.969e-01, 5.299e-01, 4.720e-01, -2.266e-01), r);
	r = MulAdd(s1_5, M4(-1.420e-02, 2.325e-02, -8.697e-02, -4.296e-03, 8.697e-02, 7.490e-02, 1.773e-01, 4.010e-01, 2.380e-01, -1.182e-01, 9.121e-01, 2.252e-01, 1.348e-01, -7.448e-02, -8.496e-01, -3.335e-01), r);
	r = MulAdd(s1_6, M4(-7.923e-02, -2.533e-02, -4.896e-02, -5.473e-02, -5.329e-03, 1.285e-02, -1.763e-02, 7.009e-02, 9.670e-04, -1.889e-02, -1.008e-01, 1.149e-01, 7.259e-03, 4.080e-02, 1.042e-01, -2.627e-01), r);
	r = MulAdd(s1_7, M4(-9.746e-02, 6.679e-02, -1.421e-01, -2.202e-01, -9.918e-03, -2.413e-02, -1.554e-02, 7.011e-03, -3.226e-02, -3.024e-02, -5.431e-02, 7.446e-02, 5.860e-02, 2.851e-02, -2.367e-01, 2.562e-02), r);
	r = MulAdd(s1_8, M4(-4.627e-02, 4.226e-02, -8.654e-02, -3.312e-02, 1.600e-02, 2.983e-02, 8.834e-03, -3.871e-02, -4.137e-03, 1.767e-02, 2.492e-02, -5.391e-02, 8.133e-03, 1.430e-02, -2.428e-02, -1.132e-01), r);
	return r;
}

// Pass 4 ("conv3"): each thread handles one input-resolution texel. It reads the 3x3
// neighbourhood of t0, splits every tap into max(x, 0) and -max(-x, 0), and stores
// f0() of the eighteen resulting vectors into t1.
void Pass4(uint2 blockStart, uint3 tid) {
	// `pt` and `pos` must keep exactly these names: the O()/l0() macros expand to them.
	float2 pt = float2(GetInputPt());

	uint2 texel = blockStart + Rmp8x8(tid.x);
	uint2 dims = GetInputSize();
	// Threads that fall outside the input do nothing.
	if (any(texel >= dims)) {
		return;
	}

	float2 pos = (texel + 0.5) * pt;

	// Raw taps, one row of the 3x3 window per line (y = -1, 0, +1; x = -1, 0, +1).
	V4 a0 = l0(-1.0, -1.0), a1 = l0(0.0, -1.0), a2 = l0(1.0, -1.0);
	V4 a3 = l0(-1.0, 0.0), a4 = l0(0.0, 0.0), a5 = l0(1.0, 0.0);
	V4 a6 = l0(-1.0, 1.0), a7 = l0(0.0, 1.0), a8 = l0(1.0, 1.0);

	// First nine arguments: max(x, 0) of each tap. Last nine: -max(-x, 0) of each tap.
	t1[texel] = f0(
		max(a0, 0.0), max(a1, 0.0), max(a2, 0.0),
		max(a3, 0.0), max(a4, 0.0), max(a5, 0.0),
		max(a6, 0.0), max(a7, 0.0), max(a8, 0.0),
		-max(-a0, 0.0), -max(-a1, 0.0), -max(-a2, 0.0),
		-max(-a3, 0.0), -max(-a4, 0.0), -max(-a5, 0.0),
		-max(-a6, 0.0), -max(-a7, 0.0), -max(-a8, 0.0));
}

//!PASS 5
//!DESC out-shuffle
//!BLOCK_SIZE 16
//!NUM_THREADS 64
//!IN INPUT, t1
//!OUT OUTPUT

// Pass-5 tap: all four channels of t1 at texel offset (x, y), fetched via O() and converted to V4.
#define l0(x, y) V4(O(t1, float2(x, y)))

// Pass-5 ("out-shuffle") kernel: starts from a 4-component bias, folds in one MulAdd per
// argument, in argument order, each with its own 16-coefficient M4 weight block, and
// returns tanh of the accumulated vector.
// Pass5 passes the max(x, 0) halves of the nine t1 taps as s0_* and the -max(-x, 0)
// halves as s1_*, and spreads the four result components over a 2x2 block of output pixels.
// NOTE(review): MulAdd is pulled in by `//!USE MulAdd`; presumably it computes
// mul(vector, matrix) + r — confirm against the Magpie definition.
// The literals are generated weights; do not reorder or reformat them by hand.
V4 f0(V4 s0_0, V4 s0_1, V4 s0_2, V4 s0_3, V4 s0_4, V4 s0_5, V4 s0_6, V4 s0_7, V4 s0_8, V4 s1_0, V4 s1_1, V4 s1_2, V4 s1_3, V4 s1_4, V4 s1_5, V4 s1_6, V4 s1_7, V4 s1_8) {
	// Bias.
	V4 r = { 7.204e-05, -6.226e-05, 2.867e-04, -3.251e-05 };
	// Contributions of the s0_* taps.
	r = MulAdd(s0_0, M4(-4.783e-03, 7.235e-03, -7.275e-03, -2.802e-03, 4.921e-02, 7.543e-02, -3.357e-02, 1.213e-02, 2.900e-02, 2.380e-03, -9.028e-03, -2.594e-02, 1.576e-03, 3.334e-04, -2.460e-02, -1.285e-02), r);
	r = MulAdd(s0_1, M4(4.582e-02, 9.378e-04, 2.217e-02, 5.083e-02, -1.054e-02, 8.518e-02, -1.884e-02, -5.149e-02, 1.983e-02, -1.106e-02, -4.317e-03, 5.384e-02, -5.193e-02, 1.089e-02, -9.384e-03, 3.137e-02), r);
	r = MulAdd(s0_2, M4(-5.241e-03, 3.821e-02, -1.136e-02, -3.033e-02, 3.186e-02, -3.270e-03, 1.422e-02, 2.401e-02, -1.360e-02, 1.024e-01, -6.042e-02, -2.325e-02, -1.248e-01, -1.377e-01, 1.654e-02, -1.347e-02), r);
	r = MulAdd(s0_3, M4(-3.552e-02, -3.211e-02, -2.282e-03, 1.775e-02, 1.360e-01, 2.808e-02, 1.082e-01, -1.311e-02, -1.699e-02, -2.628e-02, 3.430e-02, -3.880e-03, 2.514e-02, -3.171e-02, 4.675e-02, -2.711e-02), r);
	r = MulAdd(s0_4, M4(4.756e-01, 2.686e-01, 4.514e-02, -8.813e-02, 2.636e-01, -4.893e-01, 1.301e-01, 1.304e-01, 3.778e-01, 2.765e-01, 3.369e-01, 8.811e-02, 5.080e-02, 2.783e-01, -1.131e-01, 2.487e-01), r);
	r = MulAdd(s0_5, M4(-2.961e-02, 7.757e-02, -8.471e-02, -4.636e-02, -6.862e-02, 1.733e-01, -7.301e-02, -1.408e-02, 1.636e-02, 9.982e-02, 5.704e-02, 2.568e-01, -2.224e-02, -2.588e-01, -2.202e-01, -4.898e-01), r);
	r = MulAdd(s0_6, M4(1.058e-01, -2.810e-02, -2.960e-02, -8.398e-02, -9.106e-02, 6.642e-02, -2.574e-02, 7.841e-02, -1.978e-02, -3.700e-02, -1.504e-02, -3.186e-02, 2.438e-03, 6.191e-03, -1.155e-02, -1.161e-02), r);
	r = MulAdd(s0_7, M4(-6.316e-01, -7.748e-02, 8.006e-01, 3.936e-01, 1.300e-01, -1.999e-01, 2.351e-01, -7.485e-01, -7.151e-02, -4.285e-02, -2.277e-02, 2.849e-02, -2.207e-02, -2.585e-02, -2.498e-02, -3.308e-02), r);
	r = MulAdd(s0_8, M4(-2.002e-01, -6.934e-01, -1.093e-01, 3.325e-01, -5.778e-02, 2.138e-02, -2.930e-02, 1.794e-01, -3.028e-03, 2.300e-03, 5.845e-03, -1.959e-02, 1.403e-02, 1.565e-02, 1.840e-02, -6.027e-04), r);
	// Contributions of the s1_* taps.
	r = MulAdd(s1_0, M4(2.228e-02, -8.352e-03, -1.007e-02, -1.911e-02, -1.489e-02, 2.785e-03, -9.190e-03, 5.858e-03, 2.420e-02, -7.701e-03, -2.327e-02, -2.494e-02, -8.526e-03, -2.384e-02, -2.601e-02, -4.833e-02), r);
	r = MulAdd(s1_1, M4(5.671e-02, 3.666e-02, 3.309e-02, 1.011e-02, -8.053e-03, 4.673e-02, -5.358e-02, -2.451e-02, 3.779e-01, 5.642e-02, -2.324e-01, -3.499e-02, -3.479e-01, 1.179e-01, -4.630e-02, 1.118e-01), r);
	r = MulAdd(s1_2, M4(-1.650e-02, 6.203e-04, -1.322e-02, -1.996e-02, 2.118e-02, -9.244e-03, 2.813e-02, 9.773e-03, -2.654e-02, -8.373e-02, 6.663e-04, -6.860e-02, -3.436e-02, -7.207e-01, 2.389e-01, 1.903e-01), r);
	r = MulAdd(s1_3, M4(-8.045e-02, -2.073e-02, 3.380e-02, 1.327e-02, 1.247e-01, 1.129e-02, 6.421e-02, -8.326e-03, -4.675e-02, 4.920e-02, -3.699e-02, 4.601e-02, 3.389e-02, -4.151e-02, 3.012e-02, -2.241e-02), r);
	r = MulAdd(s1_4, M4(5.223e-01, 1.394e-01, 1.222e-01, -7.687e-03, -3.115e-01, 3.989e-02, -1.679e-01, 2.607e-01, 4.393e-01, -1.821e-01, 1.006e+00, -2.920e-01, 8.062e-02, 2.231e-01, -1.282e-02, 2.495e-01), r);
	r = MulAdd(s1_5, M4(-1.146e-01, 6.738e-02, -1.655e-02, 1.178e-02, -3.058e-02, 1.093e-01, 9.367e-03, 1.382e-02, -7.397e-02, 2.300e-01, -4.202e-02, 1.765e-01, -4.671e-02, -1.375e-02, -3.662e-01, -5.254e-01), r);
	r = MulAdd(s1_6, M4(5.090e-02, 8.633e-03, -1.128e-02, -3.186e-02, -6.263e-02, 4.143e-02, -2.214e-02, 5.270e-02, -1.370e-02, -1.692e-02, -2.644e-02, -9.847e-03, -2.147e-03, -7.941e-03, -1.323e-04, -5.173e-03), r);
	r = MulAdd(s1_7, M4(-9.353e-02, 6.696e-02, 2.744e-01, 2.743e-01, 9.809e-02, -1.439e-01, -2.583e-02, -3.717e-01, -5.135e-02, -1.889e-02, -1.775e-02, 9.383e-03, -2.496e-02, -2.936e-02, -2.578e-02, -1.586e-02), r);
	r = MulAdd(s1_8, M4(-1.565e-02, -1.635e-01, -1.800e-01, -2.607e-01, 1.975e-02, 1.594e-02, -4.568e-02, 1.218e-01, -6.668e-03, 7.923e-03, -4.625e-02, 1.324e-02, -6.838e-03, 2.045e-02, 1.141e-02, 2.717e-02), r);
	// Final activation.
	return tanh(r);
}

// Pass 5 ("out-shuffle"): each thread produces a 2x2 block of OUTPUT pixels.
// It evaluates f0() once on the 3x3 neighbourhood of t1 around the matching
// input-resolution texel, then for each of the four output pixels samples INPUT with
// the linear sampler, converts to the rgb2yuv space, adds one component of the f0()
// result to the first channel (clamped with saturate), and converts back with yuv2rgb.
void Pass5(uint2 blockStart, uint3 tid) {
	// `pt` and `pos` must keep exactly these names: the O()/l0() macros expand to them.
	float2 pt = float2(GetInputPt());

	// Top-left output pixel of this thread's 2x2 block.
	uint2 dst = (Rmp8x8(tid.x) << 1) + blockStart;
	uint2 outDims = GetOutputSize();
	if (any(dst >= outDims)) {
		return;
	}

	// Centre of the input-resolution texel that covers the block.
	float2 pos = ((dst >> 1) + 0.5) * pt;

	// Raw taps, one row of the 3x3 window per line (y = -1, 0, +1; x = -1, 0, +1).
	V4 a0 = l0(-1.0, -1.0), a1 = l0(0.0, -1.0), a2 = l0(1.0, -1.0);
	V4 a3 = l0(-1.0, 0.0), a4 = l0(0.0, 0.0), a5 = l0(1.0, 0.0);
	V4 a6 = l0(-1.0, 1.0), a7 = l0(0.0, 1.0), a8 = l0(1.0, 1.0);

	// First nine arguments: max(x, 0) of each tap. Last nine: -max(-x, 0) of each tap.
	V4 delta = f0(
		max(a0, 0.0), max(a1, 0.0), max(a2, 0.0),
		max(a3, 0.0), max(a4, 0.0), max(a5, 0.0),
		max(a6, 0.0), max(a7, 0.0), max(a8, 0.0),
		-max(-a0, 0.0), -max(-a1, 0.0), -max(-a2, 0.0),
		-max(-a3, 0.0), -max(-a4, 0.0), -max(-a5, 0.0),
		-max(-a6, 0.0), -max(-a7, 0.0), -max(-a8, 0.0));

	static const MF3x3 rgb2yuv = { 0.299, 0.587, 0.114, -0.169, -0.331, 0.5, 0.5, -0.419, -0.081 };
	static const MF3x3 yuv2rgb = { 1, -0.00093, 1.401687, 1, -0.3437, -0.71417, 1, 1.77216, 0.00099 };
	float2 outPt = float2(GetOutputPt());

	// Top-left pixel of the block: step back half an output pixel from the texel centre; uses delta.x.
	pos -= 0.5f * outPt;
	MF3 yuv = mul(rgb2yuv, INPUT.SampleLevel(SL, pos, 0).rgb);
	OUTPUT[dst] = MF4(mul(yuv2rgb, MF3(saturate(yuv.r + delta.x), yuv.yz)), 1);

	// Top-right pixel: one output pixel to the right; uses delta.y.
	++dst.x;
	pos.x += outPt.x;
	yuv = mul(rgb2yuv, INPUT.SampleLevel(SL, pos, 0).rgb);
	OUTPUT[dst] = MF4(mul(yuv2rgb, MF3(saturate(yuv.r + delta.y), yuv.yz)), 1);

	// Bottom-right pixel: one output pixel down; uses delta.w.
	++dst.y;
	pos.y += outPt.y;
	yuv = mul(rgb2yuv, INPUT.SampleLevel(SL, pos, 0).rgb);
	OUTPUT[dst] = MF4(mul(yuv2rgb, MF3(saturate(yuv.r + delta.w), yuv.yz)), 1);

	// Bottom-left pixel: one output pixel back to the left; uses delta.z.
	--dst.x;
	pos.x -= outPt.x;
	yuv = mul(rgb2yuv, INPUT.SampleLevel(SL, pos, 0).rgb);
	OUTPUT[dst] = MF4(mul(yuv2rgb, MF3(saturate(yuv.r + delta.z), yuv.yz)), 1);
}
